# sys, file and nav packages:
import datetime as dt
# math packages:
import pandas as pd
import numpy as np
from scipy import stats
from statsmodels.distributions.empirical_distribution import ECDF
# charting:
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
from matplotlib import ticker
from matplotlib import colors
import seaborn as sns
import matplotlib.gridspec as gridspec
# home brew utitilties
import resources.utility_functions as ut
import resources.abundance_classes as ac
import resources.chart_kwargs as ck
import resources.sr_ut as sut
# images and display
import base64, io, IPython
from PIL import Image as PILImage
from IPython.display import Markdown as md
from IPython.display import display, Math, Latex
import matplotlib.image as mpimg
# set some parameters:
today = dt.datetime.now().date().strftime("%Y-%m-%d")
start_date = '2020-03-01'
end_date ='2021-05-31'
sns.set_style('whitegrid')
a_fail_rate = 50
unit_label = 'p/100m'
reporting_unit = 100
# name of the output folder:
name_of_project = 'trans_report_all'
a_color = 'dodgerblue'
# set the maps
bassin_map = PILImage.open("resources/maps/survey_locations_all.jpeg")
bassin_pallette = {'rhone':'dimgray', 'aare':'salmon', 'linth':'tan', 'ticino':'steelblue', 'reuss':'purple'}
# define the feature level and components
comps = ['linth', 'rhone', 'aare', 'ticino']
comp_labels = {"linth":"Linth/Limmat", "rhone":"Rhône", 'aare':"Aare", "ticino":"Ticino/Cerisio", "reuss":"Reuss"}
comp_palette = {"Linth/Limmat":"dimgray", "Rhône":"tan", "Aare":"salmon", "Ticino/Cerisio":"steelblue", "Reuss":"purple"}
# explanatory variables:
luse_exp = ['% to buildings', '% to recreation', '% to agg', '% to woods', 'streets km', 'intersects']
# columns needed
use_these_cols = ['loc_date' ,
'% to buildings',
'% to trans',
'% to recreation',
'% to agg',
'% to woods',
'population',
'river_bassin',
'water_name_slug',
'streets km',
'intersects',
'length',
'groupname',
'code'
]
# these are default
top_name = ["All survey areas"]
# add the folder to the directory tree:
project_directory = ut.make_project_folder('output', name_of_project)
# get your data:
survey_data = pd.read_csv('resources/checked_sdata_eos_2020_21.csv')
river_bassins = ut.json_file_get("resources/river_basins.json")
dfBeaches = pd.read_csv("resources/beaches_with_land_use_rates.csv")
dfCodes = pd.read_csv("resources/codes_with_group_names_2015.csv")
dfDims = pd.read_csv("resources/corrected_dims.csv")
# set the index of the beach data to location slug
dfBeaches.set_index('slug', inplace=True)
city_map = dfBeaches['city']
# map locations to feature names
location_wname_key = dfBeaches.water_name_slug
# map water_name_slug to water_name
wname_wname = dfBeaches[['water_name_slug','water_name']].reset_index(drop=True).drop_duplicates()
wname_wname.set_index('water_name_slug', inplace=True)
def make_plot_with_spearmans(data, ax, n):
"""Gets Spearmans ranked correlation and make A/B scatter plot. Must proived a
matplotlib axis object.
"""
sns.scatterplot(data=data, x=n, y=unit_label, ax=ax, color='black', s=30, edgecolor='white', alpha=0.6)
corr, a_p = stats.spearmanr(data[n], data[unit_label])
return ax, corr, a_p
dfCodes.set_index("code", inplace=True)
# these descriptions need to be shortened for display
dfCodes = sut.shorten_the_value(["G74", "description", "Insulation foams"], dfCodes)
dfCodes = sut.shorten_the_value(["G940", "description", "Foamed EVA for crafts and sports"], dfCodes)
dfCodes = sut.shorten_the_value(["G96", "description", "Sanitary-pads/tampons, applicators"], dfCodes)
dfCodes = sut.shorten_the_value(["G178", "description", "Metal bottle caps and lids"], dfCodes)
dfCodes = sut.shorten_the_value(["G82", "description", "Expanded foams 2.5cm - 50cm"], dfCodes)
dfCodes = sut.shorten_the_value(["G81", "description", "Expanded foams .5cm - 2.5cm"], dfCodes)
dfCodes = sut.shorten_the_value(["G117", "description", "Expanded foams < 5mm"], dfCodes)
dfCodes = sut.shorten_the_value(["G75", "description", "Plastic/foamed polystyrene 0 - 2.5cm"], dfCodes)
dfCodes = sut.shorten_the_value(["G76", "description", "Plastic/foamed polystyrene 2.5cm - 50cm"], dfCodes)
dfCodes = sut.shorten_the_value(["G24", "description", "Plastic lid rings"], dfCodes)
dfCodes = sut.shorten_the_value(["G33", "description", "Lids for togo drinks plastic"], dfCodes)
dfCodes = sut.shorten_the_value(["G3", "description", "Plastic bags, carier bags"], dfCodes)
dfCodes = sut.shorten_the_value(["G204", "description", "Bricks, pipes not plastic"], dfCodes)
dfCodes = sut.shorten_the_value(["G904", "description", "Plastic fireworks"], dfCodes)
dfCodes = sut.shorten_the_value(["G211", "description", "Swabs, bandaging, medical"], dfCodes)
# make a map to the code descriptions
code_description_map = dfCodes.description
# make a map to the code descriptions
code_material_map = dfCodes.material
20. Shared responsibility¶
Research on litter transport and accumulation in the aquatic environment indicates that rivers are a primary sources of land based macro-plastics to the marine environment [GFCH+21]. However not all objects that are transported by rivers make it to the ocean, suggesting that rivers and inland lakes are also sinks for a portion of the macro-plastics that are emitted [KBK+18].
Provisions in Swiss law, article 2 of the Federal Act on the Protection of the Environment (LPE) the principal of causality accounts for unlawful disposal of material and is commonly known as the principle of polluter payer. Ultimately, the responsibility of elimination and management of litter pollution in and along water systems is directly on the municipal and cantonal administrations [fdlCs20]as legally, they are owners of the land within their boundaries. The law does provide municipalities and cantons the ability to consider companies or persons further up the chain of causality, as producers of waste and to charge disposal fees to them (e.g. fast-food companies and similar businesses, or organizers of events that generate large quantities of waste on the public space) when specific offenders cannot be identified.
20.1. The Challenge¶
The challenge is to meet the requirements of objective criteria. The method to meet the requirements needs to be robust, transparent and easily repeatable.
output = io.BytesIO()
this_image = PILImage.open("resources/images/gclosmay2020.jpeg")
this_image.thumbnail((800, 1200))
this_image.save(output, format='PNG')
encoded_string = base64.b64encode(output.getvalue()).decode()
html = '<img src="data:image/png;base64,{}"/>'.format(encoded_string)
IPython.display.HTML(html)
Obtaining objective criteria. Grand Clos, St. Gingoplh May 2020
The challenge is to extract as much information from the litter objects based on the quantity found, the properties of the object and the environmental variables in proximity of the survey location. Obtaining objective criteria from beach litter data is further complicated because of the \(\approx\) 61’000km of rivers and 1500 lakes in Switzerland [con21].
The hydrologic conditions of rivers have an effect on the distance and direction that litter, once introduced into a river will travel. Large, low density objects will most likely be transported to the next reservoir or area of reduced flow. High density objects will only be transported if the flow velocity and turbulence of the water are sufficient to keep the objects off the bottom. Once high density items enter a low velocity zone they tend to settle or sink [SLBH19].
The results from the latest national litter survey in Switzerland suggest that objects like cigarette ends and food wrappers are at least 21% of total objects recorded in 2020 and can be attributed to human behavior within 1500m of where it was found [ham21].
However, other objects have no definitive geographic source or no clear association with an activity in proximity to where they are found. The most common of these objects are \(\approx\) 40% of all litter items identified in 2020 -2021 [ham21]. Thus there is a clear incentive to identify litter objects that are of local origin and those that may have arrived at the survey location by some-other means.
20.2. Methods¶
The distribution of the survey results of two groups of objects were considered under two different land use profiles. The objects were grouped according to the results of the Spearmans ranked correlation test [sc21c] on the survey results of the most common objects with respect to land use conditions, see The land use profile for the complete results.
The most common objects were separated in two groups according to the correlation coefficient with respect to percent of land attributed to buildings:
Objects that have multiple positive associations to land use features and one association is to buildings
Objects that have few or no positive associations to land use features and no associations to buildings
Each distribution of both groups of objects were then evaluated for equality, difference and difference of mean using the Kolmogorov-Smirnov test [sc21a], Mann Whitney U test [sc21b] and bootsrapped permutation test for difference of means [dry20] under the two different conditions.
20.2.1. The objects¶
The most common objects are those objects that were either among the top ten most abundant objects or objects that were found in at least 50% of the surveys. The most common objects were first separated into two groups according to the previously cited criteria:
contributed (CG): objects that have multiple positive associations to land use features and one association is to buildings
Cigarette ends
Metal bottle tops
Snack wrappers
Glass bottles and pieces
distributed (DG): objects that have few or no positive associations to land use features
Fragmented expanded polystyrene
Plastic production pellets
Fragmented plastics
Cotton swabs
Industrial sheeting
Construction plastics
Note: cotton swabs are included with DG because they are usually introduced directly into a body of water after passing through a water treatment facility.
Distribtuion group (DG) objects
# read images
img_a = mpimg.imread('resources/codegroups/images/fragplass_dense_450_600.jpg')
img_b = mpimg.imread('resources/codegroups/images/fragfoam_450_600.jpg')
img_c = mpimg.imread('resources/codegroups/images/infrastructure_450_600.jpg')
img_d = mpimg.imread('resources/codegroups/images/gpis_450_600.jpg')
# display images
fig, ax = plt.subplots(2,2, figsize=(12,8))
axone=ax[0,0]
ut.hide_spines_ticks_grids(axone)
axone.imshow(img_a);
axone.set_title("Fragmented plastics", **ck.title_k14)
axtwo=ax[0,1]
ut.hide_spines_ticks_grids(axtwo)
axtwo.imshow(img_b);
axtwo.set_title("Fragmented foams", **ck.title_k14)
axthree=ax[1,0]
ut.hide_spines_ticks_grids(axthree)
axthree.imshow(img_c);
axthree.set_title("Construction plastics", **ck.title_k14)
axfour=ax[1,1]
ut.hide_spines_ticks_grids(axfour)
axfour.set_title("Plastic production pellets", **ck.title_k14)
axfour.imshow(img_d)
plt.tight_layout()
plt.show()
Two objects were selected from the CG and DG and tested under the same conditions
From CG: Food and tobacco (FT)
Cigarette ends
Snack Wrappers
From DG: Fragmented foams and plastics (FP)
Fragmented expanded polystyrene
Fragmented plastics
20.2.2. The conditions¶
The survey locations were grouped into two classes:
urban: locations that have a percent of land attributed to buildings GREATER than the median of all survey locations
rural: locations that have a percent of land attributed to buildings LESS than the median of all survey locations AND percent of land attributed to woods or agriculture greater than the median
The rural class had 148 surveys for 50 locations versus 152 surveys from 34 locations in the urban class.
20.2.3. The hypothesis¶
If there is no statistically significant evidence that a land use feature contributes to the accumulation of an object then the distribution of that object should be \(\approx\) under all land use conditions.
Null hypothesis: there is no statistically significant difference between survey results of DG objects at rural and urban locations.
alternate hypothesis: there is a statistically significant difference between survey results of DG objects at rural and urban locations.
20.3. The data¶
output = io.BytesIO()
bassin_map.thumbnail((800, 1200))
bassin_map.save(output, format='PNG')
encoded_string = base64.b64encode(output.getvalue()).decode()
html = '<img src="data:image/png;base64,{}"/>'.format(encoded_string)
IPython.display.HTML(html)
# collect all the data and format
# make a loc date column before converting to date stamp
survey_data['loc_date'] = tuple(zip(survey_data.location, survey_data['date']))
# make date stamp
survey_data['date'] = pd.to_datetime(survey_data['date'], format="%Y-%m-%d")
# the land use data was unvailable for these municipalities
no_land_use = ['Walenstadt', 'Weesen', 'Glarus Nord', 'Quarten']
# slice the data by start and end date, remove the locations with no land use data
use_these_args = ((survey_data.date >= start_date)&(survey_data.date <= end_date))
survey_data = survey_data[use_these_args].copy()
# slice date to working data
a_data = survey_data[(~survey_data.city.isin(no_land_use))].copy()
# make a column for the reporting unit and convert
a_data[unit_label] = (a_data.pcs_m *reporting_unit).astype('int')
# survey totals
fd_dt = a_data.groupby(['loc_date','location','river_bassin', 'water_name_slug','city','date', 'month', 'eom'], as_index=False).agg({unit_label:'sum', 'quantity':'sum'})
#!! feature data!!#
fd = a_data.copy()
# column headers for the survey area data
a_data['survey area'] = a_data.river_bassin.map(lambda x: ut.use_this_key(x, comp_labels))
# map survey total quantity to loc_date
fd_dq = fd_dt[['loc_date', 'quantity']].set_index('loc_date')
fd_locs = fd.location.unique()
# gather the dimensional data for the time frame from dfDims
fd_dims= dfDims[(dfDims.location.isin(fd_locs))&(dfDims.date >= start_date)&(dfDims.date <= end_date)].copy()
# map the survey area name to the dims data record
m_ap_to_survey_area = fd[['location', 'river_bassin']].drop_duplicates().to_dict(orient='records')
a_new_map = {x['location']:x['river_bassin'] for x in m_ap_to_survey_area}
# make a survey area column in the dims data
fd_dims['survey area'] = fd_dims.location.map(lambda x: ut.use_this_key(x, a_new_map))
# map length and area from dims to survey data
st_map = fd_dims[['loc_date', 'area']].to_dict(orient='records')
amap = {x['loc_date']:{'area':x['area']}for x in st_map}
trans = {x:F"{x}"for x in fd.loc_date.unique()}
def this_map(x,amap,trans, var='length'):
try:
data = amap[trans[x]][var]
except:
data = 0
return data
# fd['length'] = fd.loc_date.map(lambda x: this_map(x,amap,trans, var='length'))
fd['area'] = fd.loc_date.map(lambda x: this_map(x,amap,trans, var='area'))
# fd['water'] = fd.location.map(lambda x: dfBeaches['water'][x])
# these surveys are missing area and length data.
# use the average values from all the surveys at that location to fill in the missing valuesf
make_lengths = fd.loc[fd.location.isin(['baby-plage-geneva','quai-maria-belgia'])].groupby('location').agg({'length':'mean', 'area':'mean'})
fd.loc[fd.loc_date == ('baby-plage-geneva', '2021-04-14'), 'length'] = 84
fd.loc[fd.loc_date == ('baby-plage-geneva', '2021-04-14'), 'area'] = 355
fd.loc[fd.loc_date.isin([('quai-maria-belgia', '2021-02-28'), ('quai-maria-belgia', '2021-01-31')]), 'length'] = 34
fd.loc[fd.loc_date.isin([('quai-maria-belgia', '2021-02-28'), ('quai-maria-belgia', '2021-01-31')]), 'area'] = 145
grtr_10 = fd.copy()
# dt_nw = grtr_10.groupby(use_these_cols[:-2], as_index=False).agg({unit_label:'sum'})
# the cumulative distributions for the different survey areas
ecdfs = {x:{} for x in comps}
ecdfs.update({"All surveys":{}})
for i, n in enumerate(luse_exp):
for element in comps:
the_data = ECDF(grtr_10[grtr_10.river_bassin == element][n].values)
ecdfs[element].update({n:the_data})
# x, y = the_data.x, the_data.y
a_all_surveys = ECDF(grtr_10[n].values)
ecdfs["All surveys"].update({n:a_all_surveys})
# summary stats table
# labels for the .describe function
change_names = {'count':'# samples',
'mean':F"average {unit_label}",
'std':'standard deviation',
'25%':'25%',
'50%':'50%',
'75%':'75%',
'max':F"max {unit_label}",
'min':F"min {unit_label}",
'total objects':'total objects',
'# locations':'# locations',
'survey year':'survey year'
}
def anew_dict(x):
new_dict = {}
for param in x.index:
new_dict.update({change_names[param]:x[param]})
return new_dict
#the most abundant codes
code_totals = a_data.groupby(['code'], as_index=False).agg({unit_label:'mean', 'quantity':'sum'})
# cumulative statistics for each code
code_totals["% of total"] = ((code_totals.quantity/code_totals.quantity.sum())*100).round(2)
code_totals["fail"] = code_totals.code.map(lambda x: a_data[(a_data.code == x) & (a_data.quantity > 0)].loc_date.nunique())
code_totals["fail rate"] = ((code_totals.fail/a_data.loc_date.nunique())*100).astype('int')
code_totals.set_index('code', inplace=True)
code_totals['material'] = code_totals.index.map(lambda x: code_material_map[x])
code_totals['item'] = code_totals.index.map(lambda x: code_description_map[x])
most_abundant = code_totals.sort_values(by="quantity", ascending=False)[:10].index
# found greater than 50% of the time
l_grtr_50 = code_totals[code_totals['fail rate'] >= 50].index
# the most common
most_common = list(set([*most_abundant, *l_grtr_50]))
# eleminate surveys less than 10m and greater than 100m
# restricts surveys to locations on lakes
grtr_10 = grtr_10[(grtr_10.w_t == 'l')].copy()
bld_med = grtr_10["% to buildings"].median()
agg_med = grtr_10["% to agg"].median()
wood_med = grtr_10["% to woods"].median()
# identify rural and urban location
grtr_10['rural'] = ((grtr_10["% to woods"] >= wood_med) | (grtr_10["% to agg"] >= agg_med) ) & (grtr_10["% to buildings"] < bld_med)
grtr_10['rural'] = grtr_10['rural'].where(grtr_10['rural'] == True, 'urban')
grtr_10['rural'] = grtr_10['rural'].where(grtr_10['rural'] == 'urban', 'rural')
# make a list of the objects by their association to buildings, streets recreation and thier use
# food and drink
cont = ["G27", "G30", "G178", "G200"]
# objects not related to tobacco or food
dist = list(set(most_common) - set(cont))
# lables for the two groups and a label to catch all the other objects
grtr_10['group'] = 'other'
grtr_10['group'] = grtr_10.group.where(~grtr_10.code.isin(dist), 'DG')
grtr_10['group'] = grtr_10.group.where(~grtr_10.code.isin(cont), 'CG')
# survey totals of all locations with its land use profile
initial = ['loc_date','date','rural','group', 'streets', 'intersects']
grtr_10dt=grtr_10.groupby(initial, as_index=False).agg({unit_label:"sum", "quantity":"sum"})
grtr_10qkey = grtr_10dt[['loc_date', 'quantity']].set_index('loc_date')
# survey totals of contributed and distributed objects,
second = ['loc_date', 'group', 'rural', 'date','eom', 'river_bassin','location', 'streets', 'intersects']
grtr_10dtc=grtr_10.groupby(second, as_index=False).agg({unit_label:"sum", "quantity":"sum"})
# adding the survey total of all objects to each record
grtr_10dtc['dt']= grtr_10dtc.loc_date.map(lambda x:fd_dq.loc[[x], 'quantity'][0])
# calculating the % total of contributed and distributed at each survey
grtr_10dtc['pt']= grtr_10dtc.quantity/grtr_10dtc.dt
less_than = grtr_10dtc[(grtr_10dtc['rural'] == 'rural')].location.unique()
grt_than = grtr_10dtc[(grtr_10dtc['rural'] == 'urban')].location.unique()
grt_dtr = grtr_10dtc.groupby(['loc_date', 'date','rural'], as_index=False)[unit_label].agg({unit_label:"sum"})
# summarize the data
nsamps = survey_data.loc_date.nunique()
fsamps = grtr_10.loc_date.nunique()
nlocs = survey_data.location.nunique()
astring = F"""
There were {nsamps} surveys at {nlocs} different locations. From those samples only the {fsamps} locations that
had a complete land use profile and that were located on lakes were considered.
"""
md(astring)
There were 386 surveys at 143 different locations. From those samples only the 300 locations that had a complete land use profile and that were located on lakes were considered.
Survey results urban and rural locations March 2020 - May 2021. Left: Survey totals urban v/s rural, n=300. Right: distribution of survey results urban - rural with detail of code-group results.
# months locator, can be confusing
# https://matplotlib.org/stable/api/dates_api.html
months = mdates.MonthLocator(interval=1)
months_fmt = mdates.DateFormatter('%b')
days = mdates.DayLocator(interval=7)
fig, axs = plt.subplots(1,2, figsize=(11,6), sharey=True)
group_palette = {'CG':'magenta', 'DG':'teal', 'other':'tan'}
rural_palette = {'rural':'black', 'urban':'salmon' }
ax = axs[0]
sns.scatterplot(data=grt_dtr, x='date', y=unit_label, hue='rural', s=80, palette=rural_palette, alpha=0.6, ax=ax)
ax.set_ylim(0,grt_dtr[unit_label].quantile(.98)+50 )
ax.set_xlabel("")
ax.set_ylabel(unit_label, **ck.xlab_k14)
ax.xaxis.set_minor_locator(days)
ax.xaxis.set_major_formatter(months_fmt)
axtwo = axs[1]
box_props = {
'boxprops':{'facecolor':'none', 'edgecolor':'black'},
'medianprops':{'color':'black'},
'whiskerprops':{'color':'black'},
'capprops':{'color':'black'}
}
sns.boxplot(data=grt_dtr, x='rural', y=unit_label, dodge=False, showfliers=False, ax=axtwo, **box_props)
sns.stripplot(data=grtr_10dtc,x='rural', y=unit_label, ax=axtwo, zorder=1, hue='group', palette=group_palette, jitter=.35, alpha=0.3, s=8)
axtwo.set_ylabel(unit_label, **ck.xlab_k14)
ax.tick_params(which='both', axis='both', labelsize=14)
axtwo.tick_params(which='both', axis='both', labelsize=14)
axtwo.set_xlabel(" ")
plt.tight_layout()
plt.show()
plt.close()
Summary data all survey totals
change_names = {'count':'# samples',
'mean':F"average {unit_label}",
'std':'standard deviation',
'min p/50m':'min', '25%':'25%',
'50%':'50%', '75%':'75%',
'max':F"max {unit_label}", 'min':F"min {unit_label}",
'total objects':'total objects',
'# locations':'# locations',
'survey year':'survey year'
}
# convenience function to change the index names in a series
def anew_dict(x):
new_dict = {}
for param in x.index:
new_dict.update({change_names[param]:x[param]})
return new_dict
# select data
data = grt_dtr
# get the basic statistics from pd.describe
desc_2020 = data.groupby('rural')[unit_label].describe()
desc_2020.loc['all surveys', ['count', 'mean', 'std', 'min', '25%', '50%', '75%', 'max']] = grt_dtr.groupby(['loc_date', 'date'])[unit_label].sum().describe().to_numpy()
desc = desc_2020.astype('int')
desc = desc.applymap(lambda x: F"{x:,}")
desc.rename(columns={'count':'samples'}, inplace= True)
desc.reset_index(inplace=True)
# make tables
fig, axs = plt.subplots(figsize=(7,3.4))
# summary table
# names for the table columns
a_col = [top_name[0], 'total']
axone = axs
ut.hide_spines_ticks_grids(axone)
a_table = axone.table(cellText=desc.values, colLabels=desc.columns, colWidths=[.19,*[.1]*8], loc='lower center', bbox=[0,0,1,1])
the_material_table_data = sut.make_a_summary_table(a_table,desc.values,desc.columns, s_et_bottom_row=False)
plt.tight_layout()
plt.subplots_adjust(wspace=0.2)
plt.show()
20.4. Differences between urban and rural survey totals¶
Survey results in rural locations had a lower median and mean than urban locations and all locations combined. The maximum and the minimum values as well as the highest standard deviation were recorded at rural locations.
20.4.1. Confidence intervals (CIs) of the median survey results¶
The upper range of the median survey results from rural location is less than the lower range of the median survey result from urban locations.
# this code was modified from this source:
# http://bebi103.caltech.edu.s3-website-us-east-1.amazonaws.com/2019a/content/recitations/bootstrapping.html
# if you want to get the confidence interval around another point estimate use np.percentile
# and add the percentile value as a parameter
def draw_bs_sample(data):
"""Draw a bootstrap sample from a 1D data set."""
return np.random.choice(data, size=len(data))
def compute_jackknife_reps(data, statfunction=None, stat_param=False):
'''Returns jackknife resampled replicates for the given data and statistical function'''
# Set up empty array to store jackknife replicates
jack_reps = np.empty(len(data))
# For each observation in the dataset, compute the statistical function on the sample
# with that observation removed
for i in range(len(data)):
jack_sample = np.delete(data, i)
if not stat_param:
jack_reps[i] = statfunction(jack_sample)
else:
jack_reps[i] = statfunction(jack_sample, stat_param)
return jack_reps
def compute_a(jack_reps):
'''Returns the acceleration constant a'''
mean = np.mean(jack_reps)
try:
a = sum([(x**-(i+1)- (mean**-(i+1)))**3 for i,x in enumerate(jack_reps)])
b = sum([(x**-(i+1)-mean-(i+1))**2 for i,x in enumerate(jack_reps)])
c = 6*(b**(3/2))
data = a/c
except:
print(mean)
return data
def bootstrap_replicates(data, n_reps=1000, statfunction=None, stat_param=False):
'''Computes n_reps number of bootstrap replicates for given data and statistical function'''
boot_reps = np.empty(n_reps)
for i in range(n_reps):
if not stat_param:
boot_reps[i] = statfunction(draw_bs_sample(data))
else:
boot_reps[i] = statfunction(draw_bs_sample(data), stat_param)
return boot_reps
def compute_z0(data, boot_reps, statfunction=None, stat_param=False):
'''Computes z0 for given data and statistical function'''
if not stat_param:
s = statfunction(data)
else:
s = statfunction(data, stat_param)
return stats.norm.ppf(np.sum(boot_reps < s) / len(boot_reps))
def compute_bca_ci(data, alpha_level, n_reps=1000, statfunction=None, stat_param=False):
'''Returns BCa confidence interval for given data at given alpha level'''
# Compute bootstrap and jackknife replicates
boot_reps = bootstrap_replicates(data, n_reps, statfunction=statfunction, stat_param=stat_param)
jack_reps = compute_jackknife_reps(data, statfunction=statfunction, stat_param=stat_param)
# Compute a and z0
a = compute_a(jack_reps)
z0 = compute_z0(data, boot_reps, statfunction=statfunction, stat_param=stat_param)
# Compute confidence interval indices
alphas = np.array([alpha_level/2., 1-alpha_level/2.])
zs = z0 + stats.norm.ppf(alphas).reshape(alphas.shape+(1,)*z0.ndim)
avals = stats.norm.cdf(z0 + zs/(1-a*zs))
ints = np.round((len(boot_reps)-1)*avals)
ints = np.nan_to_num(ints).astype('int')
# Compute confidence interval
boot_reps = np.sort(boot_reps)
ci_low = boot_reps[ints[0]]
ci_high = boot_reps[ints[1]]
return (ci_low, ci_high)
the_bcas = {}
an_int = 50
# rural cis
r_median = grt_dtr[grt_dtr.rural == 'rural'][unit_label].median()
a_result = compute_bca_ci(grt_dtr[grt_dtr.rural == 'rural'][unit_label].to_numpy(), .05, n_reps=5000, statfunction=np.percentile, stat_param=an_int)
r_cis = {'rural':{"lower 2.5%":a_result[0], "median":r_median, "upper 97.5%": a_result[1] }}
the_bcas.update(r_cis)
# urban cis
u_median = grt_dtr[grt_dtr.rural == 'urban'][unit_label].median()
a_result = compute_bca_ci(grt_dtr[grt_dtr.rural == 'urban'][unit_label].to_numpy(), .05, n_reps=5000, statfunction=np.percentile, stat_param=an_int)
u_cis = {'urban':{"lower 2.5%":a_result[0], "median":u_median, "upper 97.5%": a_result[1] }}
the_bcas.update(u_cis)
# all surveys
u_median = grt_dtr[unit_label].median()
a_result = compute_bca_ci(grt_dtr[unit_label].to_numpy(), .05, n_reps=5000, statfunction=np.percentile, stat_param=an_int)
all_cis = {'all surveys':{"lower 2.5%":a_result[0], "median":u_median, "upper 97.5%": a_result[1] }}
# combine the results:
the_bcas.update(all_cis)
# make a df
the_cis = pd.DataFrame(the_bcas)
the_cis.reset_index(inplace=True)
fig, axs = plt.subplots()
data = the_cis.values
collabels = the_cis.columns
ut.hide_spines_ticks_grids(axs)
the_first_table_data = axs.table(data, colLabels=collabels, colWidths=[*[.2]*5], bbox=[0, 0, 1, 1])
a_summary_table_one = sut.make_a_summary_table(the_first_table_data,data,collabels, a_color, s_et_bottom_row=True)
a_summary_table_one.get_celld()[(0,0)].get_text().set_text(" ")
plt.show()
plt.close()
The 95% confidence interval of survey totals under urban and rural land use conditions
20.5. Survey results of CG and DG with respect to land use¶
Recall that the most common objects were grouped according to the value of the Spearmans ranked correlation test of p/100m with respect to the land use profile, (annex 1). Creating two groups, one group of objects that has a positive association with percent of land attributed to buildings and one that does not. Cotton swabs are included with the objects that have few or no associations with land use, because they are normally expelled by water treatment plants directly into the nearest body of water.
20.5.1. Assessment of composition¶
The ratio of DG to CG in the rural group was 1.7, in the urban group it was 0.81. On a per survey basis, DG were a greater percent of the total in all surveys from rural locations. In urban locations the ratio of DG to CG is the inverse of the rural locations and for approximately 20% of the surveys in urban locations the ratio of DG to CG is very close to 1.
Sample results from rural locations had a greater portion of trash attributed to fragmented plastics, construction plastics and foams.
# combine the two object groups into one data frame
DG = "DG"
CG = "CG"
dists = grtr_10dtc[(grtr_10dtc.group == DG)][['loc_date', 'location','rural', unit_label]].set_index('loc_date')
conts = grtr_10dtc[(grtr_10dtc.group == CG)][['loc_date', 'location', 'rural', unit_label]].set_index('loc_date')
conts.rename(columns={unit_label:CG}, inplace=True)
dists.rename(columns={unit_label:DG}, inplace=True)
c_v_d = pd.concat([dists, conts], axis=0)
c_v_d['dt'] = c_v_d[DG]/c_v_d[CG]
# the ratio of dist to cont under the two land use conditions
ratio_of_d_c_agg = c_v_d[c_v_d.rural == 'rural'][DG].sum()/c_v_d[c_v_d.rural == 'rural'][CG].sum()
ratio_of_d_c_urb= c_v_d[c_v_d.rural == 'urban'][DG].sum()/c_v_d[c_v_d.rural == 'urban'][CG].sum()
# chart that
fig, ax = plt.subplots(figsize=(6,5))
# get the ecdf of percent of total for each object group under each condition
# p of t urban
co_agecdf = ECDF(grtr_10dtc[(grtr_10dtc.rural == 'urban')&(grtr_10dtc.group.isin([CG]))]["pt"])
di_agecdf = ECDF(grtr_10dtc[(grtr_10dtc.rural == 'urban')&(grtr_10dtc.group.isin([DG]))]["pt"])
# p of t rural
cont_ecdf = ECDF(grtr_10dtc[(grtr_10dtc.rural == 'rural')&(grtr_10dtc.group.isin([CG]))]["pt"])
dist_ecdf = ECDF(grtr_10dtc[(grtr_10dtc.rural == 'rural')&(grtr_10dtc.group.isin([DG]))]["pt"])
sns.lineplot(x=cont_ecdf.x, y=cont_ecdf.y, color='salmon', label="rural: CG", ax=ax)
sns.lineplot(x=co_agecdf.x, y=co_agecdf.y, color='magenta', ax=ax, label="urban: CG")
sns.lineplot(x=dist_ecdf.x, y=dist_ecdf.y, color='teal', label="rural: DG", ax=ax)
sns.lineplot(x=di_agecdf.x, y=di_agecdf.y, color='black', label="urban: DG", ax=ax)
ax.set_xlabel("% of survey total", **ck.xlab_k14)
ax.set_ylabel("% of surveys", **ck.xlab_k14)
plt.legend(loc='lower right', title="% of total")
plt.show()
Difference in composition of rural and urban litter surveys
Under different land use conditions 9/10 of the most common objects are the same in rural and urban settings. There are two exceptions:
plastic construction waste made the top ten in rural settings
industrial pellets made the top ten in urban settings
The most common objects under different land use profiles. Left: rural, right: urban
rur_10 = grtr_10[(grtr_10.rural == 'rural')&(grtr_10.code.isin(most_common))].groupby('code', as_index=False).quantity.sum().sort_values(by='quantity', ascending=False)
urb_10 = grtr_10[(grtr_10.rural == 'urban')&(grtr_10.code.isin(most_common))].groupby('code', as_index=False).quantity.sum().sort_values(by='quantity', ascending=False)
rur_tot = grtr_10[grtr_10.location.isin(less_than)].quantity.sum()
urb_tot = grtr_10[grtr_10.location.isin(grt_than)].quantity.sum()
rur_10['item'] = rur_10.code.map(lambda x: code_description_map.loc[x])
urb_10['item'] = urb_10.code.map(lambda x: code_description_map.loc[x])
rur_10["% of total"] = ((rur_10.quantity/rur_tot)*100).round(1)
urb_10["% of total"] = ((urb_10.quantity/urb_tot)*100).round(1)
# make tables
fig, axs = plt.subplots(1, 2, figsize=(10,len(most_common)*.8))
# summary table
# names for the table columns
a_col = [top_name[0], 'total']
axone = axs[0]
axtwo = axs[1]
ut.hide_spines_ticks_grids(axone)
ut.hide_spines_ticks_grids(axtwo)
data_one = rur_10[['item', 'quantity', "% of total"]].copy()
data_two = urb_10[['item', 'quantity', "% of total"]].copy()
for a_df in [data_one, data_two]:
a_df['quantity'] = a_df.quantity.map(lambda x: F"{x:,}")
a_table = axone.table(cellText=data_one.values, colLabels=data_one.columns, colWidths=[.6,*[.2]*2], loc='lower center', bbox=[0,0,1,1])
the_material_table_data = sut.make_a_summary_table(a_table,data_one.values,data_one.columns, s_et_bottom_row=True)
a_table = axtwo.table(cellText=data_two.values, colLabels=data_one.columns, colWidths=[.6,*[.2]*2], loc='lower center', bbox=[0,0,1,1])
the_material_table_data = sut.make_a_summary_table(a_table,data_one.values,data_one.columns, s_et_bottom_row=True)
axone.set_xlabel("rural", **ck.xlab_k14)
axtwo.set_xlabel("urban", **ck.xlab_k14)
plt.tight_layout()
plt.subplots_adjust(wspace=0.2)
plt.show()
20.5.2. Distribution of survey totals¶
The survey results of the DG are very similar under both land use classes, there is more variance as the reported value increases but not so much that the distributions diverge. Given the standard deviation of the samples and the high variance of beach-litter-survey data in general this is expected. [HG19]
The two sample Kolmogorov-Smirnov(KS) test(ks=0.073, p=0.808) of the two sets of survey results suggest that the survey results of DG may not be significantly different between the two land use classes. The results from the Mann-Whitney U (MWU) (U=11445.0, p=0.762) *suggest that it is possible that the two distributions are the same.**[sc21a] [sc21b]
Empirical Cumulative Distribution (ECDF) of the survey results of DG and CG objects under the different land use classes
left: DG , right: CG
dist_results_agg = grtr_10dtc[(grtr_10dtc.rural == 'rural')&(grtr_10dtc.group == DG)].groupby(['loc_date', 'group'])[unit_label].sum().values
dist_results_urb = grtr_10dtc[(grtr_10dtc.rural == 'urban')&(grtr_10dtc.group == DG)].groupby(['loc_date', 'group'])[unit_label].sum().values
a_d_ecdf = ECDF(dist_results_agg )
b_d_ecdf = ECDF(dist_results_urb )
cont_results_agg = grtr_10dtc[(grtr_10dtc.rural == 'rural')&(grtr_10dtc.group == CG)].groupby('loc_date')[unit_label].sum().values
cont_results_urb = grtr_10dtc[(grtr_10dtc.rural == 'urban')&(grtr_10dtc.group == CG)].groupby('loc_date')[unit_label].sum().values
a_d_ecdf_cont = ECDF(cont_results_agg)
b_d_ecdf_cont = ECDF(cont_results_urb)
fig, ax = plt.subplots(1,2, figsize=(8,5))
axone = ax[0]
sns.lineplot(x=a_d_ecdf.x, y=a_d_ecdf.y, color='salmon', label="rural", ax=axone)
sns.lineplot(x=b_d_ecdf.x, y=b_d_ecdf.y, color='black', label="urban", ax=axone)
axone.set_xlabel(unit_label, **ck.xlab_k14)
axone.set_ylabel('% of surveys', **ck.xlab_k14)
axone.legend(fontsize=12, title=DG,title_fontsize=14)
axtwo = ax[1]
sns.lineplot(x=a_d_ecdf_cont.x, y=a_d_ecdf_cont.y, color='salmon', label="rural", ax=axtwo)
sns.lineplot(x=b_d_ecdf_cont.x, y=b_d_ecdf_cont.y, color='black', label="urban", ax=axtwo)
axtwo.set_xlabel(unit_label, **ck.xlab_k14)
axtwo.set_ylabel(' ')
axtwo.legend(fontsize=12, title=CG,title_fontsize=14)
plt.show()
On the other hand the CG survey results diverge almost immediately according to the land use class. This test results supports the decision to reject the null hypothesis of the Spearmans ranked correlation test for this group of codes and land use profile. The rural locations have less buildings and more agriculture or woods. The two conditions together should reduce the amount of tobacco and food wrappers for that class.
The KS and MWU tests both agree with the visual results that rural and urban CG survey results most likely come from different distributions and they are not equal (ks=0.284, pvalue<.0001), (U=7559.0, pvalue<.0001).
20.5.2.1. Difference of means¶
The average survey result of DG objects in rural locations was 139.86p/100m in urban locations and 117.75p/100m. A permutation test on the difference of means was conducted on the condition rural - urban.
Difference of means DG objects. \(\mu_{rural}\) - \(\mu_{urban}\), method=shuffle, permutations=5000.
# print(stats.ks_2samp(dist_results_agg, dist_results_urb, alternative='two-sided', mode='auto'))
# print(stats.mannwhitneyu(dist_results_agg,dist_results_urb, alternative='two-sided'))
# #the results for contributed objects
# print(stats.ks_2samp(cont_results_agg, cont_results_urb, alternative='two-sided', mode='auto'))
# print(stats.mannwhitneyu(cont_results_agg, cont_results_urb, alternative='two-sided'))
# pemutation test: of difference of means HD objects
agg_dobj = grtr_10[(grtr_10.rural == 'rural')&(grtr_10.group == DG)].groupby(['loc_date'], as_index=False)[unit_label].sum()
buld_obj = grtr_10[(grtr_10.rural == 'urban')&(grtr_10.group == DG)].groupby(['loc_date'], as_index=False)[unit_label].sum()
# label the urban and rural results
agg_dobj['class'] = 'rural'
buld_obj['class'] = 'urban'
# merge into one
objs_merged = agg_dobj.append(buld_obj)
# store the mean per class
the_mean = objs_merged.groupby('class')[unit_label].mean()
# store the difference
mean_diff = the_mean.loc['rural'] - the_mean.loc['urban']
new_means=[]
# permutation resampling:
for element in np.arange(5000):
objs_merged['new_class'] = objs_merged['class'].sample(frac=1).values
b=objs_merged.groupby('new_class').mean()
new_means.append((b.loc['rural'] - b.loc['urban']).values[0])
emp_p = np.count_nonzero(new_means <= (the_mean.loc['rural'] - the_mean.loc['urban'])) / 5000
# chart the results
fig, ax = plt.subplots()
sns.histplot(new_means, ax=ax)
ax.set_title(F"$\u0394\mu$ = {np.round(mean_diff, 2)}, perm=5000, p={emp_p} ", **ck.title_k14)
ax.set_ylabel('permutations', **ck.xlab_k14)
ax.set_xlabel("$\mu$ rural - $\mu$ urban", **ck.xlab_k14)
plt.show()
Refuse to reject the null hypothesis that these two distributions may be the same
The test of the difference of medians supports the observations that DG survey results occur at similar rates indifferent of the land use profile.
20.5.3. Seasonal variations¶
Seasonal variations of beach litter survey results has been documented under many conditions and many environments. In 2018 the SLR [Bla18] reported the maximum value in July and the minimum in November. The year 2020-2021 presents the same results.
# the survey results to test
corr_data = grtr_10[(grtr_10.code.isin(most_common))].copy()
results_sprmns = {}
for i,code in enumerate(most_common):
data = corr_data[corr_data.code == code]
for j, n in enumerate(luse_exp):
corr, a_p = stats.spearmanr(data[n], data[unit_label])
results_sprmns.update({code:{"rho":corr, 'p':a_p}})
# helper dict for converting ints to months
months={
0:'Jan',
1:'Feb',
2:'Mar',
3:'Apr',
4:'May',
5:'Jun',
6:'Jul',
7:'Aug',
8:'Sep',
9:'Oct',
10:'Nov',
11:'Dec'
}
m_dt = grtr_10.groupby(['loc_date', 'date','group'], as_index=False).agg({'quantity':'sum', unit_label:'sum'})
# sample totals all objects
m_dt_t = grtr_10.groupby(['loc_date','date','month', 'eom'], as_index=False).agg({unit_label:'sum'})
m_dt_t.set_index('date', inplace=True)
# data montlhy median survey results contributed, distributed and survey total
fxi=m_dt.set_index('date', drop=True)
data1 = fxi[fxi.group == CG][unit_label].resample("M").mean()
data2 = fxi[fxi.group == DG][unit_label].resample("M").mean()
# helper tool for months in integer order
def new_month(x):
if x <= 11:
this_month = x
else:
this_month=x-12
return this_month
# the monthly average discharge rate of the three rivers where the majority of the samples are
aare_schonau = [61.9, 53, 61.5, 105, 161, 155, 295, 244, 150, 106, 93, 55.2, 74.6, 100, 73.6, 92.1]
rhone_scex = [152, 144, 146, 155, 253, 277, 317, 291, 193, 158, 137, 129, 150, 146, 121, 107]
linth_weesen = [25.3, 50.7, 40.3, 44.3, 64.5, 63.2, 66.2, 61.5, 55.9, 52.5, 35.2, 30.5, 26.1, 42.0, 36.9]
fig, ax = plt.subplots()
this_x = [i for i,x in enumerate(data1.index)]
this_month = [x.month for i,x in enumerate(data1.index)]
twin_ax = ax.twinx()
twin_ax.grid(None)
ax.bar(this_x, data1.to_numpy(), label='contributed', bottom=data2.to_numpy(), linewidth=1, color="salmon", alpha=0.6)
ax.bar([i for i,x in enumerate(data2.index)], data2.to_numpy(), label='distributed', linewidth=1,color="darkslategray", alpha=0.6)
sns.scatterplot(x=this_x,y=[*aare_schonau[2:], np.mean(aare_schonau)], color='turquoise', edgecolor='magenta', linewidth=1, s=60, label='Aare m³/s', ax=twin_ax)
sns.scatterplot(x=this_x,y=[*rhone_scex[2:], np.mean(rhone_scex)], color='royalblue', edgecolor='magenta', linewidth=1, s=60, label='Rhône m³/s', ax=twin_ax)
sns.scatterplot(x=this_x,y=[*linth_weesen[2:], np.mean(linth_weesen), np.mean(linth_weesen)], color='orange', edgecolor='magenta', linewidth=1, s=60, label='Linth m³/s', ax=twin_ax)
handles, labels = ax.get_legend_handles_labels()
handles2, labels2 = twin_ax.get_legend_handles_labels()
ax.xaxis.set_major_locator(ticker.FixedLocator([i for i in np.arange(len(this_x))]))
ax.set_ylabel(unit_label, **ck.xlab_k14)
twin_ax.set_ylabel("m³/sec", **ck.xlab_k14)
axisticks = ax.get_xticks()
labelsx = [months[new_month(x-1)] for x in this_month]
plt.xticks(ticks=axisticks, labels=labelsx)
plt.legend([*handles, *handles2], [*labels, *labels2], bbox_to_anchor=(0,-.1), loc='upper left', fontsize=14)
plt.show()
monthly survey results and river discharge rates m³/second
April and May 2021 are rolling averages, data not available
source : https://www.hydrodaten.admin.ch/en/stations-and-data.html?entrance_medium=menu
20.6. The survey results of FP and FT with respect to land use¶
Results of KS test and Mann Whitney U
The survey results for FP objects is very similar up to \(\approx\) the 85th percentile where the rural survey results are noticeably larger. Suggesting that extreme values for FP were more likely in rural locations. According to the KS test (ks=0.78, pvalue=0.69) and MWU test (U=10624, pvalue=0.40) the distribution of FP objects under the two land use classes is not significantly different and may be equal.
The survey results for FT objects maintain the same features as the parent distribution. The results of the KS test (ks=0.29, pvalue<.001) and MWU test (U=7356.5, p<.001) agree with the results of the parent group, that there is a statistically relevant difference between the survey results under different land use classes.
Left rural - urban: ECDF of survey results fragmented plastics and foams (FP)
Right rural - urban: ECDF of survey results cigarette ends and candy wrappers (FT)
agg_dobj = grtr_10[(grtr_10.rural == 'rural')&(grtr_10.code.isin(['Gfrags', 'Gfoam']))].groupby(['loc_date'])[unit_label].sum().values
buld_obj = grtr_10[(grtr_10.rural == 'urban')&(grtr_10.code.isin(['Gfrags', 'Gfoam']))].groupby(['loc_date'])[unit_label].sum().values
a_d_ecdf = ECDF(agg_dobj)
b_d_ecdf = ECDF(buld_obj)
agg_cont = grtr_10[(grtr_10.rural == 'rural')&(grtr_10.code.isin(['G27', 'G30']))].groupby(['loc_date'])[unit_label].sum().values
b_cont = grtr_10[(grtr_10.rural == 'urban')&(grtr_10.code.isin(['G27', 'G30']))].groupby(['loc_date'])[unit_label].sum().values
a_c_ecdf = ECDF(agg_cont)
b_c_ecdf = ECDF(b_cont)
fig, ax = plt.subplots(1,2, figsize=(8,5))
axone = ax[0]
sns.lineplot(x=a_d_ecdf.x, y=a_d_ecdf.y, color='salmon', label="rural", ax=axone)
sns.lineplot(x=b_d_ecdf.x, y=b_d_ecdf.y, color='black', label="urban", ax=axone)
axone.set_xlabel(unit_label, **ck.xlab_k14)
axone.set_ylabel('% of surveys', **ck.xlab_k14)
axone.legend(fontsize=12, title='FP',title_fontsize=14)
axtwo = ax[1]
sns.lineplot(x=a_c_ecdf.x, y=a_c_ecdf.y, color='salmon', label="rural", ax=axtwo)
sns.lineplot(x=b_c_ecdf.x, y=b_c_ecdf.y, color='black', label="urban", ax=axtwo)
axtwo.set_xlabel(unit_label, **ck.xlab_k14)
axtwo.set_ylabel(' ')
axtwo.legend(fontsize=12, title='FT',title_fontsize=14)
plt.tight_layout()
plt.show()
20.6.1. FP and FT difference of means.¶
The average survey result of FP objects in rural locations was 22.93p/50m in urban locations it was 12p/50m. A permutation test on the difference of means was conducted on the condition rural - urban.
Difference of means fragmented foams and plastics under the two different land use classes.
\(\mu_{rural}\) - \(\mu_{urban}\), method=shuffle, permutations=5000
# pemutation test: of difference of means FP objects
agg_dobj = grtr_10[(grtr_10.rural == 'rural')&(grtr_10.code.isin(['Gfrags', 'G89']))].groupby(['loc_date'], as_index=False)[unit_label].sum()
buld_obj = grtr_10[(grtr_10.rural == 'urban')&(grtr_10.code.isin(['Gfrags', 'G89']))].groupby(['loc_date'], as_index=False)[unit_label].sum()
# label the urban and rural results
agg_dobj['class'] = 'rural'
buld_obj['class'] = 'urban'
# merge into one
objs_merged = agg_dobj.append(buld_obj)
# store the mean per class
the_mean = objs_merged.groupby('class')[unit_label].mean()
# store the difference
mean_diff = the_mean.loc['rural'] - the_mean.loc['urban']
new_means=[]
# permutation resampling:
for element in np.arange(5000):
objs_merged['new_class'] = objs_merged['class'].sample(frac=1).values
b=objs_merged.groupby('new_class').mean()
new_means.append((b.loc['rural'] - b.loc['urban']).values[0])
emp_p = np.count_nonzero(new_means <= (the_mean.loc['rural'] - the_mean.loc['urban'])) / 5000
# chart the results
fig, ax = plt.subplots()
sns.histplot(new_means, ax=ax)
ax.set_title(F"$\u0394\mu$ = {np.round(mean_diff, 2)}, perm=5000, p={emp_p} ", **ck.title_k14)
ax.set_ylabel('permutations', **ck.xlab_k14)
ax.set_xlabel("$\mu$ rural - $\mu$ urban", **ck.xlab_k14)
plt.show()
Refuse to reject the null hypotheses: there is no statistically significant difference between the two distributions
Difference of means cigarette ends and snack wrappers under the two different land use classes.
\(\mu_{rural}\) - \(\mu_{urban}\), method=shuffle, permutations=5000
# pemutation test: of difference of means food objects
agg_cont = grtr_10[(grtr_10.rural == 'rural')&(grtr_10.code.isin(['G27', 'G30']))].groupby(['loc_date'], as_index=False)[unit_label].sum()
b_cont = grtr_10[(grtr_10.rural == 'urban')&(grtr_10.code.isin(['G27', 'G30']))].groupby(['loc_date'], as_index=False)[unit_label].sum()
# label the urban and rural results
agg_cont['class'] = 'rural'
b_cont['class'] = 'urban'
# merge into one
objs_merged = agg_cont.append(b_cont)
# store the mean per class
the_mean = objs_merged.groupby('class')[unit_label].mean()
# store the difference
mean_diff = the_mean.loc['rural'] - the_mean.loc['urban']
# permutation resampling:
for element in np.arange(5000):
objs_merged['new_class'] = objs_merged['class'].sample(frac=1).values
b=objs_merged.groupby('new_class').mean()
new_means.append((b.loc['rural'] - b.loc['urban']).values[0])
emp_p = np.count_nonzero(new_means <= (the_mean.loc['rural'] - the_mean.loc['urban'])) / 5000
# chart the results
fig, ax = plt.subplots()
sns.histplot(new_means, ax=ax)
ax.set_title(F"$\u0394\mu$ = {np.round(mean_diff, 2)}, perm=5000, p={emp_p} ", **ck.title_k14)
ax.set_ylabel('permutations', **ck.xlab_k14)
ax.set_xlabel("$\mu$ rural - $\mu$ urban", **ck.xlab_k14)
plt.show()
Reject the null hypothesis: the two distributions are most likely not the same
20.7. Discussion¶
A positive statistically relevant association between CG objects and land use attributed to infrastructure such as streets, recreation areas and buildings can be assumed. Representing 4/12 most common objects, they are \(\approx\) 28% of all objects found and can be associated to activities within 1500m of the survey location.
In contrast the DG group has an \(\approx\) distribution under the different land use classes and no association to percent of land attributed to buildings. Composed of construction plastics, fragmented foams and plastics and industrial pellets the DG represents a diverse group of objects with different densities. With no statistical evidence to the contrary the null hypothesis cannot be rejected.
Changes in the quantities of DG objects are not directly related to the measured land use characteristics and are \(\approx\) under all land use classes. Therefore it can not be assumed that the primary source was within 1500m of the survey location and that a portion of these objects have origins upstream (economically and geographically).
Reducing the quantities of DG objects in the environment will require coordination across municipal and cantonal boundaries given the transport method under consideration. Fortunately this coordination has been implemented since 2015 in Switzerland by various associations and institutions at many levels. However, the definition of objective criteria is yet to be established officially.
20.7.1. Collecting objective data¶
The purpose of collecting objective data is to find partners.
Establishing objective criteria. Identifying and counting the objects after a litter survey
output = io.BytesIO()
this_image = PILImage.open("resources/codegroups/images/takingnotes.jpg")
this_image.thumbnail((800, 1200))
this_image.save(output, format='PNG')
encoded_string = base64.b64encode(output.getvalue()).decode()
html = '<img src="data:image/png;base64,{}"/>'.format(encoded_string)
IPython.display.HTML(html)
The data is then entered into the application by smartphone or pc
The requirement of objective criteria is a way of ensuring that the principle of causality is applied appropriately. Beach litter survey results provide the basic elements of objective criteria:
The GPS coordinates where the objects were found
The date the objects were found
The quantity of objects found
A description of the object and its uses in the economy
The material type of the object
The dimensions of the area surveyed
There are limits to the information that can be extracted from beach-litter surveys given the current protocol. These limits are related to the costs to complete one survey and the resulting number of samples per sampling period.
Care must be taken to maintain objectivity throughout the process. The best way to estimate the current state is having as many proper samples as possible within the region and sampling period of study.
Currently the protocol allows for a proven, repeatable, broad-based assessment with minimal financial burden and almost no fixed costs. Thus allowing all stakeholders a chance to create partnerships and assess performance at different administrative and regional levels.
20.7.2. Creating partnerships¶
The methods used to establish objective criteria should also be used in all benchmark and baseline calculations. Having benchmarks and goals are essential in establishing meaning full partnerships and determining return on investment
20.7.2.1. Finding partners¶
The results from the test indicate that CG objects are more prevalent in urban locations. Urban was defined as the land use within 1500m of the survey area. From this it is safe to assume that the cause(s) of CG group litter are also more prevalent in urban areas and that the secondary cause of the litter is within 1500m of the survey location.
Stakeholders looking to reduce the incidence of CG objects within a specific zone may have a better chance of finding motivated partners within 1500m of the location of concern.
The DG group has the particularity that it is distributed in \(\approx\) rates indifferent of the land use and it makes up a larger proportion of the objects found than CG. This implies that the solution is at a larger scale than the municipal boundaries.
Fragmented plastics is the only DG object on the list that cannot be attributed to at least one industry that is present in all the survey areas covered by this analysis.
Expanded polystyrene is used as an exterior insulation envelope in the construction industry and is also used in packaging to protect fragile objects during transport.
Plastic production pellets are used to make plastic objects in the injection molding process.
Cotton swabs are often diverted to rivers and lakes after passing through a water treatment plant.
Industrial sheeting is used in horticulture, transport and the construction industry.
Construction plastics
Finding partners for these objects may involve an initial phase of informative targeted communication that calls attention to these study results, and current EU thresholds and baselines for beach litter [HG19].
20.7.2.2. Individual items¶
In cases such as the plastic production pellet (GPI), the use case of the item is definite and users and producers are relatively rare with respect to other litter items. Therefore the origin can be determined by searching for consumers of that product within 1500m of the survey location and then progressing upstream. This can be done by locating the survey locations on a map and then overlaying the locations of likely consumers or producers of the object in question.
A causal relationship could be inferred from the image below but not assumed. GPI are small and difficult to clean up once they have been spilled making the exact source impossible to determine. However it is reasonable to assume that the handlers and consumers of GPI will have the best ideas on how to prevent them from escaping into the environment.
output = io.BytesIO()
bassin_map = PILImage.open("resources/images/causality.jpeg")
bassin_map.thumbnail((800, 1200))
bassin_map.save(output, format='PNG')
encoded_string = base64.b64encode(output.getvalue()).decode()
html = '<img src="data:image/png;base64,{}"/>'.format(encoded_string)
IPython.display.HTML(html)
Identifying secondary sources of specific litter items. Consumers or handlers of plastic production pellets and probable fluvial route to survey location. Venoge and Thiele rivers.
Enterprise names withheld
This process can be repeated for any number of distinct items. Currently there are unique codes for various items that can be positively identified and have been found multiple times:
Pheromone baits for vineyards
Agricultural fencing
Electrical tape
Cable ties
String trimmer line
20.7.2.3. Sharing the responsibility¶
The principle of Extended Producer Responsibility (EPR) may provide the incentive to producers and consumers to account for the real costs of end-of-life management for the most common litter items identified in Switzerland. [Pou20]
A recent study in the Marine Policy journal identified several limitations of using preexisting beach litter survey data to assess the impact of EPR policy on observed litter quantities. [HLCM21]
Limited data
Heterogeneous methods
Data not collected for the purposes of evaluating ERP
To correct these limitations the authors provide the following recommendations:
Create a data framework specifically for monitoring ERP targets
Identify sources
Use litter counts to establish baselines
Frequent monitoring
Litter counts mitigate the effects of light-weight packaging when collection results are based on weights. [HLCM21]
The IQAASl project addresses three out of the four recommendations and it introduced a method that allows stakeholders to add specific items to the survey protocol. Thus monitoring progress towards ERP targets can be implemented as long as the objects can be defined visually and can be counted.
The current database of beach-litter surveys in Switzerland includes over 1,500 samples collected using the same protocol with two distinct sampling periods of twelve months. Switzerland has all the elements in place to accurately estimate minimum probable values for the most common objects and evaluate stochastic changes on a monthly interval.
This report offers several different ways to evaluate differences between survey results, there are certainly others that should be considered. There are many improvements to be made concerning the national strategy:
Defining a standardized reporting method for municipal, cantonal and federal stakeholders
Define monitoring or assessment goals
Formalize the data repository and the method for implementation at different administrative levels
Develop a network of associations that share the responsibility and resources for surveying the territory
Develop and implement a formal training program for surveyors that includes data science, and GIS technologies
Determine, in collaboration with academic partners, ideal sampling scenarios and research needs
Develop a financing method to ensure that enough samples are collected each year in each survey area so that accurate assessments can be made and research requirements are met.
20.8. Annex¶
20.8.1. IQAASL surveyors¶
Hammerdirt staff:
Shannon Erismann, field operations manager
Helen Kurukulasuriya, surveyor
Débora Carmo, surveyor
Bettina Siegenthaler, surveyor
Roger Erismann, surveyor
Participating organizations:
Precious plastic leman
Association pour la sauvetage du Léman
Geneva international School
Solid waste management: École polytechnique fédéral Lausanne